# Load the raw data set.
housing <- read.csv("housing.csv")

# Fill missing total_bedrooms values with the median of the observed ones.
housing$total_bedrooms <- replace(
  housing$total_bedrooms,
  is.na(housing$total_bedrooms),
  median(housing$total_bedrooms, na.rm = TRUE)
)

# Express bedrooms and rooms per household instead of as raw totals.
housing[["mean_bedrooms"]] <- housing[["total_bedrooms"]] / housing[["households"]]
housing[["mean_rooms"]] <- housing[["total_rooms"]] / housing[["households"]]

# The raw totals are no longer needed once the per-household ratios exist.
drops <- c("total_bedrooms", "total_rooms")
housing <- housing[, setdiff(names(housing), drops)]
names(housing)
## [1] "longitude" "latitude" "housing_median_age"
## [4] "population" "households" "median_income"
## [7] "median_house_value" "ocean_proximity" "mean_bedrooms"
## [10] "mean_rooms"
# Preview the first six rows of the prepared data frame.
head(housing, n = 6L)
## longitude latitude housing_median_age population households median_income
## 1 -122.23 37.88 41 322 126 8.3252
## 2 -122.22 37.86 21 2401 1138 8.3014
## 3 -122.24 37.85 52 496 177 7.2574
## 4 -122.25 37.85 52 558 219 5.6431
## 5 -122.25 37.85 52 565 259 3.8462
## 6 -122.25 37.85 52 413 193 4.0368
## median_house_value ocean_proximity mean_bedrooms mean_rooms
## 1 452600 NEAR BAY 1.0238095 6.984127
## 2 358500 NEAR BAY 0.9718805 6.238137
## 3 352100 NEAR BAY 1.0734463 8.288136
## 4 341300 NEAR BAY 1.0730594 5.817352
## 5 342200 NEAR BAY 1.0810811 6.281853
## 6 269700 NEAR BAY 1.1036269 4.761658
library(ggplot2)
library(DataExplorer)
library(ggpubr)
library(ggmap)
# Names of all columns except the categorical ocean_proximity.
# The column is excluded by name rather than by position (it was index 8) so
# that the result stays correct if the column order of `housing` changes.
columns <- setdiff(names(housing), "ocean_proximity")

# Density plot for each continuous column of `housing`.
DataExplorer::plot_density(housing)
#count_plot <- ggplot(housing, aes(ocean_proximity)) +
# geom_bar(colour="black", fill = "grey") +
# theme_pubclean()
# Bar chart counting the rows in each ocean_proximity category; bars are
# filled per category and outlined in black.
count_plot <- ggplot(
  data = housing,
  mapping = aes(x = ocean_proximity, fill = ocean_proximity)
) +
  geom_bar(colour = "black") +
  theme_pubclean()
count_plot
library(corrplot)
# Correlation heatmap over the columns of `housing`.
DataExplorer::plot_correlation(data = housing)
Największe korelacje między zmiennymi:
library(ggplot2)
# Build a violin plot (with a narrow boxplot overlaid) showing how one
# numeric column of `housing` is distributed within each ocean_proximity
# category.
#
# y_var: character scalar, name of the `housing` column drawn on the y axis.
# Returns the ggplot object; nothing is drawn until the object is printed.
plot_by_ocean_proximity <- function(y_var) {
  ggplot(
    housing,
    aes(x = ocean_proximity, y = .data[[y_var]], fill = ocean_proximity)
  ) +
    geom_violin() +
    geom_boxplot(width = 0.1) +
    # scale_fill_brewer(palette = "PuOr") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    # Set the axis title explicitly: the default label would be derived from
    # the `.data[[y_var]]` expression rather than from the column name.
    labs(y = y_var)
}

p1 <- plot_by_ocean_proximity("housing_median_age")
p2 <- plot_by_ocean_proximity("mean_rooms")
p3 <- plot_by_ocean_proximity("mean_bedrooms")
p4 <- plot_by_ocean_proximity("population")
p5 <- plot_by_ocean_proximity("households")
p6 <- plot_by_ocean_proximity("median_income")
p7 <- plot_by_ocean_proximity("median_house_value")

# p2 and p3 are built but intentionally not displayed.
p1
# p2
# p3
p4
p5
p6
p7
# Bounding box spanning every longitude/latitude present in `housing`.
us <- c(
  left = min(housing$longitude),
  bottom = min(housing$latitude),
  right = max(housing$longitude),
  top = max(housing$latitude)
)

# Base map for that bounding box.
# NOTE(review): get_stamenmap() may no longer be available in recent ggmap
# releases -- confirm against the installed version.
p <- ggmap(get_stamenmap(us, zoom = 7, maptype = "toner-lite"))

# Observations on the map, coloured by median_house_value.
p +
  geom_point(
    data = housing,
    mapping = aes(x = longitude, y = latitude, color = median_house_value)
  ) +
  ggtitle("Heatmap of median_house_value") +
  theme_light() +
  theme(legend.position = "bottom", legend.direction = "vertical")

# Observations on the map, coloured by ocean_proximity category.
p +
  geom_point(
    data = housing,
    mapping = aes(x = longitude, y = latitude, color = ocean_proximity),
    alpha = 0.5,
    size = 0.5
  ) +
  ggtitle("Ocean_proximity on map") +
  theme_light() +
  theme(legend.position = "bottom", legend.direction = "horizontal")
# Rows of `housing` whose ocean_proximity is "ISLAND".
a <- dplyr::filter(housing, ocean_proximity == "ISLAND")
# Number of ISLAND observations. nrow() counts rows; length() on a data frame
# counts columns, so it reported 10 (the column count of `housing`) no matter
# how many rows matched.
nrow(a)
## (stale output removed: "[1] 10" was the column count from length(a); re-run to record the row count)
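Podział kategorii zmiennej ocean_proximity i wizualizacja na mapie może wytłumaczyć dziwne wykresy skrzypcowe dla ocean_proximity=ISLAND. Jedynie nieliczne obserwacje mają taką wartość (ich dokładną liczbę podaje `nrow()`; `length()` dla ramki danych zwraca liczbę kolumn, a nie wierszy).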
# Scatter plot of population against households, coloured by
# median_house_value, with a red linear-regression line overlaid.
ggplot(
  data = housing,
  mapping = aes(x = households, y = population, color = median_house_value)
) +
  # alpha and size belong to the point layer; passed to ggplot() they were
  # silently ignored.
  geom_point(alpha = 0.5, size = 0.5) +
  ggtitle("Households and population correlation") +
  theme_light() +
  stat_smooth(method = "lm", col = "red") +
  theme(legend.position = "bottom", legend.direction = "vertical")
## `geom_smooth()` using formula 'y ~ x'
# Scatter plot of mean_bedrooms against mean_rooms, coloured by
# median_house_value, with a red linear-regression line overlaid.
ggplot(
  data = housing,
  mapping = aes(x = mean_rooms, y = mean_bedrooms, color = median_house_value)
) +
  # alpha and size belong to the point layer; passed to ggplot() they were
  # silently ignored.
  geom_point(alpha = 0.5, size = 0.5) +
  ggtitle("Mean_room and mean_bedroom correlation") +
  theme_light() +
  stat_smooth(method = "lm", col = "red") +
  theme(legend.position = "bottom", legend.direction = "vertical")
## `geom_smooth()` using formula 'y ~ x'
# Scatter plot of median_house_value against median_income with a red
# linear-regression line overlaid.
ggplot(
  data = housing,
  mapping = aes(x = median_income, y = median_house_value)
) +
  # alpha and size belong to the point layer; passed to ggplot() they were
  # silently ignored.
  geom_point(alpha = 0.5, size = 0.5) +
  ggtitle("Median_income and median_house_value correlation") +
  theme_light() +
  stat_smooth(method = "lm", col = "red") +
  theme(legend.position = "bottom", legend.direction = "vertical")
## `geom_smooth()` using formula 'y ~ x'
# Largest median_house_value present in the data.
max(housing[["median_house_value"]])
## [1] 500001
Ostatni wykres pokazuje nam, że zmienna celu została obcięta do wartości 500001.